{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2021-07-05 17:43:35--  https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 887071 (866K) [text/plain]\n",
      "Saving to: ‘austen-emma.txt’\n",
      "\n",
      "austen-emma.txt     100%[===================>] 866.28K  1.41MB/s    in 0.6s    \n",
      "\n",
      "2021-07-05 17:43:36 (1.41 MB/s) - ‘austen-emma.txt’ saved [887071/887071]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer\n",
    "import tensorflow as tf\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.models import BPE\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.decoders import ByteLevel as ByteLevelDecoder\n",
    "from tokenizers.normalizers import NFKC, Sequence, Lowercase\n",
    "from tokenizers.pre_tokenizers import ByteLevel\n",
    "from tokenizers.trainers import BpeTrainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(BPE())\n",
    "tokenizer.normalizer = Sequence([\n",
    "    Lowercase()\n",
    "])\n",
    "tokenizer.pre_tokenizer = ByteLevel()\n",
    "tokenizer.decoder = ByteLevelDecoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = BpeTrainer(vocab_size=50000, inital_alphabet=ByteLevel.alphabet(), special_tokens=[\n",
    "            \"<s>\",\n",
    "            \"<pad>\",\n",
    "            \"</s>\",\n",
    "            \"<unk>\",\n",
    "            \"<mask>\"\n",
    "        ])\n",
    "tokenizer.train([\"austen-emma.txt\"], trainer)"
   ]
  },
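  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (not part of the original run): round-trip a sample sentence\n",
    "# through the freshly trained byte-level BPE tokenizer and inspect tokens and ids.\n",
    "encoding = tokenizer.encode(\"Emma Woodhouse, handsome, clever, and rich\")\n",
    "print(encoding.tokens)\n",
    "print(encoding.ids)\n",
    "print(tokenizer.decode(encoding.ids))"
   ]
  },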
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir tokenizer_gpt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(\"tokenizer_gpt/tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2TokenizerFast, GPT2Config, TFGPT2LMHeadModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tokenizer_gpt = GPT2TokenizerFast.from_pretrained(\"tokenizer_gpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.add_special_tokens({\n",
    "  \"eos_token\": \"</s>\",\n",
    "  \"bos_token\": \"<s>\",\n",
    "  \"unk_token\": \"<unk>\",\n",
    "  \"pad_token\": \"<pad>\",\n",
    "  \"mask_token\": \"<mask>\"\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.eos_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 265, 157, 56, 2]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.encode(\"<s> this is </s>\")"
   ]
  },
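  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check (not part of the original run): decode the ids from the previous cell\n",
    "# back to text and list the special tokens registered on the wrapped GPT-2 tokenizer.\n",
    "print(tokenizer_gpt.decode([0, 265, 157, 56, 2]))\n",
    "print(tokenizer_gpt.all_special_tokens)"
   ]
  },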
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = GPT2Config(\n",
    "  vocab_size=tokenizer_gpt.vocab_size,\n",
    "  bos_token_id=tokenizer_gpt.bos_token_id,\n",
    "  eos_token_id=tokenizer_gpt.eos_token_id\n",
    ")\n",
    "model = TFGPT2LMHeadModel(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GPT2Config {\n",
       "  \"activation_function\": \"gelu_new\",\n",
       "  \"attn_pdrop\": 0.1,\n",
       "  \"bos_token_id\": 0,\n",
       "  \"embd_pdrop\": 0.1,\n",
       "  \"eos_token_id\": 2,\n",
       "  \"gradient_checkpointing\": false,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"layer_norm_epsilon\": 1e-05,\n",
       "  \"model_type\": \"gpt2\",\n",
       "  \"n_ctx\": 1024,\n",
       "  \"n_embd\": 768,\n",
       "  \"n_head\": 12,\n",
       "  \"n_inner\": null,\n",
       "  \"n_layer\": 12,\n",
       "  \"n_positions\": 1024,\n",
       "  \"resid_pdrop\": 0.1,\n",
       "  \"summary_activation\": null,\n",
       "  \"summary_first_dropout\": 0.1,\n",
       "  \"summary_proj_to_labels\": true,\n",
       "  \"summary_type\": \"cls_index\",\n",
       "  \"summary_use_proj\": true,\n",
       "  \"transformers_version\": \"4.3.2\",\n",
       "  \"use_cache\": true,\n",
       "  \"vocab_size\": 11750\n",
       "}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config"
   ]
  },
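  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional (a sketch, not part of the original run): build the TF model once on its\n",
    "# dummy inputs so that model.summary() can report the parameter count for this config.\n",
    "_ = model(model.dummy_inputs)\n",
    "model.summary()"
   ]
  },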
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"austen-emma.txt\", \"r\", encoding='utf-8') as f:\n",
    "    content = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_p = []\n",
    "for c in content:\n",
    "    if len(c)>10:\n",
    "        content_p.append(c.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "content_p = \" \".join(content_p)+tokenizer_gpt.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_content = tokenizer_gpt.encode(content_p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "examples = []\n",
    "block_size = 100\n",
    "BATCH_SIZE = 12\n",
    "BUFFER_SIZE = 1000\n",
    "for i in range(0, len(tokenized_content)):\n",
    "    examples.append(tokenized_content[i:i + block_size])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = [] \n",
    "labels = [] \n",
    "for example in examples: \n",
    "    train_data.append(example[:-1]) \n",
    "    labels.append(example[1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change 1000 if you want to train on full data\n",
    "dataset = tf.data.Dataset.from_tensor_slices((train_data[:1000], labels[:1000]))\n",
    "dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)"
   ]
  },
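  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional inspection (not part of the original run): each batch should contain\n",
    "# BATCH_SIZE sequences of block_size - 1 input ids and matching labels.\n",
    "for batch_inputs, batch_labels in dataset.take(1):\n",
    "    print(batch_inputs.shape, batch_labels.shape)"
   ]
  },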
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).\n",
      "WARNING:tensorflow:AutoGraph could not transform <bound method Socket.send of <zmq.sugar.socket.Socket object at 0x7ffa0fd3b820>> and will run it as-is.\n",
      "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n",
      "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n",
      "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n",
      "WARNING: AutoGraph could not transform <bound method Socket.send of <zmq.sugar.socket.Socket object at 0x7ffa0fd3b820>> and will run it as-is.\n",
      "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n",
      "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n",
      "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n",
      "WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/meysam/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/indexed_slices.py:431: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).\n",
      "WARNING:tensorflow:The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.\n",
      "83/83 [==============================] - 212s 3s/step - loss: 6.7420 - logits_loss: 6.7420 - logits_accuracy: 0.0992 - past_key_values_1_accuracy: 0.0026 - past_key_values_2_accuracy: 0.0021 - past_key_values_3_accuracy: 0.0018 - past_key_values_4_accuracy: 0.0018 - past_key_values_5_accuracy: 0.0027 - past_key_values_6_accuracy: 0.0022 - past_key_values_7_accuracy: 0.0033 - past_key_values_8_accuracy: 0.0032 - past_key_values_9_accuracy: 0.0034 - past_key_values_10_accuracy: 0.0018 - past_key_values_11_accuracy: 0.0039 - past_key_values_12_accuracy: 0.0019\n"
     ]
    }
   ],
   "source": [
    "# increase number of epochs for higher accuracy and lower loss\n",
    "num_epoch = 1\n",
    "history = model.fit(dataset, epochs=num_epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate(start):  \n",
    "    input_token_ids = tokenizer_gpt.encode(start, return_tensors='tf')  \n",
    "    output = model.generate(  \n",
    "        input_token_ids,  \n",
    "        max_length = 10,  \n",
    "        num_beams = 5,  \n",
    "        temperature = 0.7,  \n",
    "        no_repeat_ngram_size=2,  \n",
    "        num_return_sequences=1  \n",
    "    )  \n",
    "    return tokenizer_gpt.decode(output[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'  of her. the had, and the,'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to 2 (first `eos_token_id`) to generate sequence\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'wetson was very good, and the,'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\"wetson was very good\")"
   ]
  },
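  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional variation (a sketch, not part of the original run): the same generate() API\n",
    "# with sampling instead of beam search; the prompt and top_k/top_p values are illustrative.\n",
    "input_ids = tokenizer_gpt.encode(\"emma was\", return_tensors='tf')\n",
    "sampled = model.generate(input_ids, max_length=20, do_sample=True, top_k=50, top_p=0.95)\n",
    "print(tokenizer_gpt.decode(sampled[0]))"
   ]
  },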
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir my_gpt-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained(\"my_gpt-2/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All model checkpoint layers were used when initializing TFGPT2LMHeadModel.\n",
      "\n",
      "All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at my_gpt-2/.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.\n"
     ]
    }
   ],
   "source": [
    "model_reloaded = TFGPT2LMHeadModel.from_pretrained(\"my_gpt-2/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import WEIGHTS_NAME, CONFIG_NAME, TF2_WEIGHTS_NAME, AutoModel, AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('tokenizer_gpt_auto/tokenizer_config.json',\n",
       " 'tokenizer_gpt_auto/special_tokens_map.json',\n",
       " 'tokenizer_gpt_auto/vocab.json',\n",
       " 'tokenizer_gpt_auto/merges.txt',\n",
       " 'tokenizer_gpt_auto/added_tokens.json')"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.save_pretrained(\"tokenizer_gpt_auto/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All TF 2.0 model weights were used when initializing GPT2Model.\n",
      "\n",
      "Some weights of GPT2Model were not initialized from the TF 2.0 model and are newly initialized: ['h.0.attn.bias', 'h.0.attn.masked_bias', 'h.1.attn.bias', 'h.1.attn.masked_bias', 'h.2.attn.bias', 'h.2.attn.masked_bias', 'h.3.attn.bias', 'h.3.attn.masked_bias', 'h.4.attn.bias', 'h.4.attn.masked_bias', 'h.5.attn.bias', 'h.5.attn.masked_bias', 'h.6.attn.bias', 'h.6.attn.masked_bias', 'h.7.attn.bias', 'h.7.attn.masked_bias', 'h.8.attn.bias', 'h.8.attn.masked_bias', 'h.9.attn.bias', 'h.9.attn.masked_bias', 'h.10.attn.bias', 'h.10.attn.masked_bias', 'h.11.attn.bias', 'h.11.attn.masked_bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "model = AutoModel.from_pretrained(\"my_gpt-2/\", from_tf = True) \n",
    "tokenizer = AutoTokenizer.from_pretrained(\"tokenizer_gpt_auto\")"
   ]
  },
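  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check (not part of the original run): the auto-loaded tokenizer should produce\n",
    "# the same ids for the same text as the tokenizer_gpt it was saved from.\n",
    "print(tokenizer.encode(\"<s> this is </s>\"))\n",
    "print(tokenizer_gpt.encode(\"<s> this is </s>\"))"
   ]
  },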
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
